{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Introduction to DataFrames\n",
    "**[Bogumił Kamiński](http://bogumilkaminski.pl/about/), Apr 21, 2018**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "using DataFrames"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Possible pitfalls"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Know what is copied when creating a `DataFrame`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"data-frame\"><thead><tr><th></th><th>x1</th><th>x2</th><th>x3</th><th>x4</th><th>x5</th></tr></thead><tbody><tr><th>1</th><td>0.738683</td><td>0.575963</td><td>0.221199</td><td>0.961925</td><td>0.171644</td></tr><tr><th>2</th><td>0.247084</td><td>0.0548955</td><td>0.842629</td><td>0.358732</td><td>0.849723</td></tr><tr><th>3</th><td>0.428472</td><td>0.677116</td><td>0.712318</td><td>0.780728</td><td>0.0472096</td></tr></tbody></table>"
      ],
      "text/plain": [
       "3×5 DataFrames.DataFrame\n",
       "│ Row │ x1       │ x2        │ x3       │ x4       │ x5        │\n",
       "├─────┼──────────┼───────────┼──────────┼──────────┼───────────┤\n",
       "│ 1   │ 0.738683 │ 0.575963  │ 0.221199 │ 0.961925 │ 0.171644  │\n",
       "│ 2   │ 0.247084 │ 0.0548955 │ 0.842629 │ 0.358732 │ 0.849723  │\n",
       "│ 3   │ 0.428472 │ 0.677116  │ 0.712318 │ 0.780728 │ 0.0472096 │"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x = DataFrame(rand(3, 5))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "true"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y = DataFrame(x)\n",
    "x === y # no copyinng performed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "false"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y = copy(x)\n",
    "x === y # not the same object"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "true"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all(x[i] === y[i] for i in ncol(x)) # but the columns are the same"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"data-frame\"><thead><tr><th></th><th>x</th><th>y</th></tr></thead><tbody><tr><th>1</th><td>1</td><td>1</td></tr><tr><th>2</th><td>2</td><td>2</td></tr><tr><th>3</th><td>3</td><td>3</td></tr></tbody></table>"
      ],
      "text/plain": [
       "3×2 DataFrames.DataFrame\n",
       "│ Row │ x │ y │\n",
       "├─────┼───┼───┤\n",
       "│ 1   │ 1 │ 1 │\n",
       "│ 2   │ 2 │ 2 │\n",
       "│ 3   │ 3 │ 3 │"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x = 1:3; y = [1, 2, 3]; df = DataFrame(x=x,y=y) # the same when creating arrays or assigning columns, except ranges"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "true"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y === df[:y] # the same object"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(UnitRange{Int64}, Array{Int64,1})"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "typeof(x), typeof(df[:x]) # range is converted to a vector"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Do not modify the parent of `GroupedDataFrame`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DataFrames.GroupedDataFrame  2 groups with keys: Symbol[:id]\n",
       "First Group:\n",
       "3×2 DataFrames.SubDataFrame{Array{Int64,1}}\n",
       "│ Row │ id │ x │\n",
       "├─────┼────┼───┤\n",
       "│ 1   │ 1  │ 1 │\n",
       "│ 2   │ 1  │ 3 │\n",
       "│ 3   │ 1  │ 5 │\n",
       "⋮\n",
       "Last Group:\n",
       "3×2 DataFrames.SubDataFrame{Array{Int64,1}}\n",
       "│ Row │ id │ x │\n",
       "├─────┼────┼───┤\n",
       "│ 1   │ 2  │ 2 │\n",
       "│ 2   │ 2  │ 4 │\n",
       "│ 3   │ 2  │ 6 │"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x = DataFrame(id=repeat([1,2], outer=3), x=1:6)\n",
    "g = groupby(x, :id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DataFrames.GroupedDataFrame  2 groups with keys: Symbol[:id]\n",
       "First Group:\n",
       "3×2 DataFrames.SubDataFrame{Array{Int64,1}}\n",
       "│ Row │ id │ x │\n",
       "├─────┼────┼───┤\n",
       "│ 1   │ 2  │ 1 │\n",
       "│ 2   │ 2  │ 3 │\n",
       "│ 3   │ 1  │ 5 │\n",
       "⋮\n",
       "Last Group:\n",
       "3×2 DataFrames.SubDataFrame{Array{Int64,1}}\n",
       "│ Row │ id │ x │\n",
       "├─────┼────┼───┤\n",
       "│ 1   │ 2  │ 2 │\n",
       "│ 2   │ 2  │ 4 │\n",
       "│ 3   │ 2  │ 6 │"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x[1:3, 1]=[2,2,2]\n",
    "g # well - it is wrong now, g is only a view"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Remember that you can filter columns of a `DataFrame` using booleans"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"data-frame\"><thead><tr><th></th><th>x1</th><th>x2</th><th>x3</th><th>x4</th><th>x5</th></tr></thead><tbody><tr><th>1</th><td>0.236033</td><td>0.210968</td><td>0.555751</td><td>0.209472</td><td>0.0769509</td></tr><tr><th>2</th><td>0.346517</td><td>0.951916</td><td>0.437108</td><td>0.251379</td><td>0.640396</td></tr><tr><th>3</th><td>0.312707</td><td>0.999905</td><td>0.424718</td><td>0.0203749</td><td>0.873544</td></tr><tr><th>4</th><td>0.00790928</td><td>0.251662</td><td>0.773223</td><td>0.287702</td><td>0.278582</td></tr><tr><th>5</th><td>0.488613</td><td>0.986666</td><td>0.28119</td><td>0.859512</td><td>0.751313</td></tr></tbody></table>"
      ],
      "text/plain": [
       "5×5 DataFrames.DataFrame\n",
       "│ Row │ x1         │ x2       │ x3       │ x4        │ x5        │\n",
       "├─────┼────────────┼──────────┼──────────┼───────────┼───────────┤\n",
       "│ 1   │ 0.236033   │ 0.210968 │ 0.555751 │ 0.209472  │ 0.0769509 │\n",
       "│ 2   │ 0.346517   │ 0.951916 │ 0.437108 │ 0.251379  │ 0.640396  │\n",
       "│ 3   │ 0.312707   │ 0.999905 │ 0.424718 │ 0.0203749 │ 0.873544  │\n",
       "│ 4   │ 0.00790928 │ 0.251662 │ 0.773223 │ 0.287702  │ 0.278582  │\n",
       "│ 5   │ 0.488613   │ 0.986666 │ 0.28119  │ 0.859512  │ 0.751313  │"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "srand(1)\n",
    "x = DataFrame(rand(5, 5))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"data-frame\"><thead><tr><th></th><th>x1</th><th>x4</th></tr></thead><tbody><tr><th>1</th><td>0.236033</td><td>0.209472</td></tr><tr><th>2</th><td>0.346517</td><td>0.251379</td></tr><tr><th>3</th><td>0.312707</td><td>0.0203749</td></tr><tr><th>4</th><td>0.00790928</td><td>0.287702</td></tr><tr><th>5</th><td>0.488613</td><td>0.859512</td></tr></tbody></table>"
      ],
      "text/plain": [
       "5×2 DataFrames.DataFrame\n",
       "│ Row │ x1         │ x4        │\n",
       "├─────┼────────────┼───────────┤\n",
       "│ 1   │ 0.236033   │ 0.209472  │\n",
       "│ 2   │ 0.346517   │ 0.251379  │\n",
       "│ 3   │ 0.312707   │ 0.0203749 │\n",
       "│ 4   │ 0.00790928 │ 0.287702  │\n",
       "│ 5   │ 0.488613   │ 0.859512  │"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x[x[:x1] .< 0.25] # well - we have filtered columns not rows by accident as you can select columns using booleans"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"data-frame\"><thead><tr><th></th><th>x1</th><th>x2</th><th>x3</th><th>x4</th><th>x5</th></tr></thead><tbody><tr><th>1</th><td>0.236033</td><td>0.210968</td><td>0.555751</td><td>0.209472</td><td>0.0769509</td></tr><tr><th>2</th><td>0.00790928</td><td>0.251662</td><td>0.773223</td><td>0.287702</td><td>0.278582</td></tr></tbody></table>"
      ],
      "text/plain": [
       "2×5 DataFrames.DataFrame\n",
       "│ Row │ x1         │ x2       │ x3       │ x4       │ x5        │\n",
       "├─────┼────────────┼──────────┼──────────┼──────────┼───────────┤\n",
       "│ 1   │ 0.236033   │ 0.210968 │ 0.555751 │ 0.209472 │ 0.0769509 │\n",
       "│ 2   │ 0.00790928 │ 0.251662 │ 0.773223 │ 0.287702 │ 0.278582  │"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x[x[:x1] .< 0.25, :] # probably this is what we wanted"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Column selection for DataFrame creates aliases unless explicitly copied"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"data-frame\"><thead><tr><th></th><th>a</th><th>b</th><th>c</th><th>d</th><th>e</th></tr></thead><tbody><tr><th>1</th><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td></tr><tr><th>2</th><td>2</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><th>3</th><td>3</td><td>3</td><td>3</td><td>3</td><td>3</td></tr></tbody></table>"
      ],
      "text/plain": [
       "3×5 DataFrames.DataFrame\n",
       "│ Row │ a │ b │ c │ d │ e │\n",
       "├─────┼───┼───┼───┼───┼───┤\n",
       "│ 1   │ 1 │ 1 │ 1 │ 1 │ 1 │\n",
       "│ 2   │ 2 │ 2 │ 2 │ 2 │ 2 │\n",
       "│ 3   │ 3 │ 3 │ 3 │ 3 │ 3 │"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<table class=\"data-frame\"><thead><tr><th></th><th>a</th><th>b</th><th>c</th><th>d</th><th>e</th></tr></thead><tbody><tr><th>1</th><td>100</td><td>100</td><td>100</td><td>1</td><td>1</td></tr><tr><th>2</th><td>2</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><th>3</th><td>3</td><td>3</td><td>3</td><td>3</td><td>3</td></tr></tbody></table>"
      ],
      "text/plain": [
       "3×5 DataFrames.DataFrame\n",
       "│ Row │ a   │ b   │ c   │ d │ e │\n",
       "├─────┼─────┼─────┼─────┼───┼───┤\n",
       "│ 1   │ 100 │ 100 │ 100 │ 1 │ 1 │\n",
       "│ 2   │ 2   │ 2   │ 2   │ 2 │ 2 │\n",
       "│ 3   │ 3   │ 3   │ 3   │ 3 │ 3 │"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "x = DataFrame(a=1:3)\n",
    "x[:b] = x[1] # alias\n",
    "x[:c] = x[:, 1] # also alias\n",
    "x[:d] = x[1][:] # copy\n",
    "x[:e] = copy(x[1]) # explicit copy\n",
    "display(x)\n",
    "x[1,1] = 100\n",
    "display(x)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Julia 0.6.2",
   "language": "julia",
   "name": "julia-0.6"
  },
  "language_info": {
   "file_extension": ".jl",
   "mimetype": "application/julia",
   "name": "julia",
   "version": "0.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
